Missing Values Imputer

Import Packages


In [8]:
import pandas as pd 
from autoc.explorer import cserie,DataExploration
from autoc.utils.helpers import *
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  
matplotlib.style.use('ggplot')
import seaborn as sns 
plt.rcParams['figure.figsize'] = (12.0, 8)
np.random.seed(0)

Give me some Credit data


In [9]:
# Load Give me Some credit
path = '/Users/ericfourrier/Documents/Data/Give_Me_Some_Credit/cs-training.csv'
df_train = pd.read_csv(path)

In [10]:
# if you prefer to work wit hdatabase 
# from sqlalchemy import create_engine
# engine = create_engine('sqlite://')
# df_train.to_sql('cstraining',engine)
# engine.table_names()
# test = pd.read_sql("select * from cstraining",engine)

In [11]:
df_train.head(10)


Out[11]:
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 0.766127 45 2 0.802982 9120 13 0 6 0 2
1 0 0.957151 40 0 0.121876 2600 4 0 0 0 1
2 0 0.658180 38 1 0.085113 3042 2 1 0 0 0
3 0 0.233810 30 0 0.036050 3300 5 0 0 0 0
4 0 0.907239 49 1 0.024926 63588 7 0 1 0 0
5 0 0.213179 74 0 0.375607 3500 3 0 1 0 1
6 0 0.305682 57 0 5710.000000 NaN 8 0 3 0 0
7 0 0.754464 39 0 0.209940 3500 8 0 0 0 0
8 0 0.116951 27 0 46.000000 NaN 2 0 0 0 NaN
9 0 0.189169 57 0 0.606291 23684 9 0 4 0 2

In [12]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB

In [13]:
df_train.groupby('NumberOfDependents').mean()


Out[13]:
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents
0 0.058629 5.666405 54.396642 0.423247 418.946560 5873.411467 8.158052 0.291892 0.905641 0.268038
1 0.073529 4.852905 51.851117 0.365405 174.621397 7205.172361 9.175179 0.185553 1.143829 0.156901
2 0.081139 7.710358 46.460609 0.413636 242.657069 7576.163446 9.031657 0.205102 1.263754 0.182307
3 0.088263 9.341881 45.901086 0.361489 228.205045 8396.601029 8.938416 0.163239 1.267637 0.126542
4 0.103774 1.909806 45.562544 0.442348 212.753479 8691.199398 8.966108 0.205101 1.282320 0.171209
5 0.091153 0.393077 46.151475 0.461126 198.931910 9260.174165 8.979893 0.262735 1.286863 0.231903
6 0.151899 0.437257 47.075949 0.341772 126.818586 8686.493333 8.753165 0.139241 1.202532 0.120253
7 0.098039 0.401806 48.392157 0.372549 144.332806 8934.653061 8.196078 0.137255 1.235294 0.156863
8 0.083333 0.332304 49.541667 0.166667 88.093734 8312.954545 7.416667 0.166667 1.250000 0.000000
9 0.000000 0.351158 45.800000 0.000000 718.681982 8538.750000 9.800000 0.200000 1.000000 0.000000
10 0.000000 0.112469 46.600000 0.400000 724.313041 6879.000000 7.400000 0.000000 0.400000 0.000000
13 0.000000 1.000000 53.000000 1.000000 0.230054 3333.000000 3.000000 1.000000 1.000000 0.000000
20 0.000000 0.226616 40.000000 1.000000 0.576539 6316.000000 11.000000 2.000000 1.000000 0.000000

In [14]:
exploration = DataExploration(df_train)

In [15]:
exploration.structure()


Out[15]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key
SeriousDlqin2yrs int64 numeric 0 0.000000 2 False False False
RevolvingUtilizationOfUnsecuredLines float64 numeric 0 0.000000 125728 False False False
age int64 numeric 0 0.000000 86 False False False
NumberOfTime30-59DaysPastDueNotWorse int64 numeric 0 0.000000 16 False False False
DebtRatio float64 numeric 0 0.000000 114194 False False False
MonthlyIncome float64 numeric 29731 0.198207 13594 False False False
NumberOfOpenCreditLinesAndLoans int64 numeric 0 0.000000 58 False False False
NumberOfTimes90DaysLate int64 numeric 0 0.000000 19 False False False
NumberRealEstateLoansOrLines int64 numeric 0 0.000000 28 False False False
NumberOfTime60-89DaysPastDueNotWorse int64 numeric 0 0.000000 13 False False False
NumberOfDependents float64 numeric 3924 0.026160 13 False False False

In [16]:
exploration.nearzerovar()


                                      freq_ratio    nzv  percent_unique  \
SeriousDlqin2yrs                       13.961101  False        0.001333   
RevolvingUtilizationOfUnsecuredLines    1.060647  False       83.818667   
age                                     1.008145  False        0.057333   
NumberOfTime30-59DaysPastDueNotWorse    7.859914  False        0.010667   
DebtRatio                              17.960699  False       76.129333   
MonthlyIncome                           1.309117  False        9.062667   
NumberOfOpenCreditLinesAndLoans         1.027860  False        0.038667   
NumberOfTimes90DaysLate                27.019264   True        0.012667   
NumberRealEstateLoansOrLines            1.073560  False        0.018667   
NumberOfTime60-89DaysPastDueNotWorse   24.846624   True        0.008667   
NumberOfDependents                      3.302250  False        0.008667   

                                     zero_var  
SeriousDlqin2yrs                        False  
RevolvingUtilizationOfUnsecuredLines    False  
age                                     False  
NumberOfTime30-59DaysPastDueNotWorse    False  
DebtRatio                               False  
MonthlyIncome                           False  
NumberOfOpenCreditLinesAndLoans         False  
NumberOfTimes90DaysLate                 False  
NumberRealEstateLoansOrLines            False  
NumberOfTime60-89DaysPastDueNotWorse    False  
NumberOfDependents                      False  
Out[16]:
Index([u'NumberOfTimes90DaysLate', u'NumberOfTime60-89DaysPastDueNotWorse'], dtype='object')

In [17]:
exploration.findcorr()


Out[17]:
['NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfTime30-59DaysPastDueNotWorse']

Bin true numeric values


In [18]:
df_train_categoric = df_train.copy()

In [19]:
# Bin true numeric columns 
cols_to_bin = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'MonthlyIncome']
nb_quantiles = 10
for col in cols_to_bin:
    df_train_categoric.loc[:, col] = pd.qcut(df_train_categoric.loc[:,col],nb_quantiles).astype('str')

In [20]:
#Transform ervery variables to Categorical type of pandas 
# fix problem with category variable
# df_train_categoric = df_train_categoric.apply(lambda x: x.astype('str'),axis = 0)

In [21]:
df_train_categoric.dtypes


Out[21]:
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines     object
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                                object
MonthlyIncome                            object
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [22]:
df_train_categoric.describe()


Out[22]:
SeriousDlqin2yrs age NumberOfTime30-59DaysPastDueNotWorse NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000
mean 0.066840 52.295207 0.421033 8.452760 0.265973 1.018240 0.240387 0.757222
std 0.249746 14.771866 4.192781 5.145951 4.169304 1.129771 4.155179 1.115086
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 41.000000 0.000000 5.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 52.000000 0.000000 8.000000 0.000000 1.000000 0.000000 0.000000
75% 0.000000 63.000000 0.000000 11.000000 0.000000 2.000000 0.000000 1.000000
max 1.000000 109.000000 98.000000 58.000000 98.000000 54.000000 98.000000 20.000000

In [23]:
df_simu = df_train_categoric.copy()

In [24]:
ec = DataExploration(df_simu)

In [25]:
def simulate_na_col(df, colname, n=None, pct=None, weights=None,
                    safety=True, *args, **kwargs):
    """ Simulate missing values in a column of categorical variables

    Notes
    -----
    Fix issue with category variable"""
    # if df.loc[:,colname].dtype == 'float' or df.loc[:,colname].dtype == 'int':
    #     raise ValueError('This function only support categorical variables')
    if (n is None) and (pct is not None):
        # be careful here especially if cols has a lot of missing values
        n = int(pct * df.shape[0])
    if isinstance(colname, pd.core.index.Index) or isinstance(colname, list):
        for c in colname:
            simulate_na_col(df, colname=c, n=n, pct=pct, weights=weights)
    else:
        if safety:
            tokeep = keep_category(df, colname, *args, **kwargs)
        col = df.loc[:, colname].drop(tokeep)  # we are not smapling from tokeep
        col = col.dropna()
        print(colname)
        col_distribution = col.value_counts(normalize=True, sort=False)
        labels = col_distribution.index  # characters
        # generate random pmf
        pmf_na = weights if weights else random_pmf(len(labels))
        na_distribution = pd.Series(data=pmf_na, index=labels)
        # draw samples from this pmf
        weights_na = col.apply(lambda x: na_distribution[x])
        weights_na /= weights_na.sum()
        index_to_replace = col.sample(
            n=n, weights=weights_na, replace=False).index
        df.loc[index_to_replace, colname] = np.nan

In [26]:
# fix problem with category variable
simulate_na_col(df_simu,list(df_train_categoric.columns),n=80000)


SeriousDlqin2yrs
RevolvingUtilizationOfUnsecuredLines
age
NumberOfTime30-59DaysPastDueNotWorse
DebtRatio
MonthlyIncome
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberRealEstateLoansOrLines
NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents

In [27]:
df_simu.isnull().sum(axis = 0)


Out[27]:
SeriousDlqin2yrs                        80000
RevolvingUtilizationOfUnsecuredLines    80000
age                                     80000
NumberOfTime30-59DaysPastDueNotWorse    80000
DebtRatio                               80000
MonthlyIncome                           80000
NumberOfOpenCreditLinesAndLoans         80000
NumberOfTimes90DaysLate                 80000
NumberRealEstateLoansOrLines            80000
NumberOfTime60-89DaysPastDueNotWorse    80000
NumberOfDependents                      83924
dtype: int64

In [309]:
df_simu.hist()


Out[309]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x13e09acd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13e14bc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13e346f90>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x13e3a9e50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13e4381d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13e3e3690>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x13e737a90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13e7bc7d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13f020f10>]], dtype=object)

In [310]:
df_train_categoric.hist()


Out[310]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x13f1134d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13f622310>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13f6a0350>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x13f703690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13f7868d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x13f72ffd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x141fd32d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x142291fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x142303750>]], dtype=object)

In [308]:
df_train.dtypes


Out[308]:
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [221]:
kl_series(df_simu.SeriousDlqin2yrs,df_train_categoric.SeriousDlqin2yrs)


Out[221]:
0.0014179270382744264

In [222]:
df_simu.SeriousDlqin2yrs.dropna().value_counts(normalize=True).values


Out[222]:
array([ 0.91946,  0.08054])

In [223]:
df_train_categoric.SeriousDlqin2yrs.value_counts(normalize=True).values


Out[223]:
array([ 0.93316,  0.06684])

In [224]:
kl(df_train_categoric.SeriousDlqin2yrs.value_counts(normalize=True).values,df_simu.SeriousDlqin2yrs.dropna().value_counts(normalize=True).values)


Out[224]:
0.0013390936562772265

In [311]:
for col in df_simu.columns:
    try : 
        print("Kullback-Leibler divergence between both distribution: {}".format(
                kl_series(df_simu.loc[:,col],df_train_categoric.loc[:,col])))
    except Exception as e:
        print('error:{}'.format(e))


Kullback-Leibler divergence between both distribution: 0.00704095392282
Kullback-Leibler divergence between both distribution: 0.147324050536
Kullback-Leibler divergence between both distribution: 0.0463178838332
Kullback-Leibler divergence between both distribution: 0.0111943913358
Kullback-Leibler divergence between both distribution: 0.128988756617
Kullback-Leibler divergence between both distribution: 0.055585472736
Kullback-Leibler divergence between both distribution: 0.0112393151998
Kullback-Leibler divergence between both distribution: 0.0176357749221
Kullback-Leibler divergence between both distribution: 0.0311527559851
Kullback-Leibler divergence between both distribution: 0.00528514754948
Kullback-Leibler divergence between both distribution: 0.0230901204554

NaImputer class test


In [1]:
from autoc import NaImputer, missing_map

In [31]:
missing_map(df_simu,nmax=1000) # no pattern visible


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x121e66bd0>

In [28]:
na = NaImputer(df_simu)

In [32]:
na.corrplot_na() # totally missing at random



In [33]:
na.infos_na()


Out[33]:
{'low_na_col': [],
 'many_na_col': [],
 'nacolcount':                                       Nanumber  Napercentage
 SeriousDlqin2yrs                         80000      0.533333
 RevolvingUtilizationOfUnsecuredLines     80000      0.533333
 age                                      80000      0.533333
 NumberOfTime30-59DaysPastDueNotWorse     80000      0.533333
 DebtRatio                                80000      0.533333
 MonthlyIncome                            80000      0.533333
 NumberOfOpenCreditLinesAndLoans          80000      0.533333
 NumberOfTimes90DaysLate                  80000      0.533333
 NumberRealEstateLoansOrLines             80000      0.533333
 NumberOfTime60-89DaysPastDueNotWorse     80000      0.533333
 NumberOfDependents                       83924      0.559493,
 'narowcount':         Nanumber  Napercentage
 0              9      0.000060
 1              5      0.000033
 2              6      0.000040
 3              5      0.000033
 4              7      0.000047
 5              6      0.000040
 6              6      0.000040
 7              5      0.000033
 8              6      0.000040
 9              6      0.000040
 10             4      0.000027
 11             6      0.000040
 12             6      0.000040
 13             4      0.000027
 14             7      0.000047
 15             4      0.000027
 16             7      0.000047
 17             4      0.000027
 18             6      0.000040
 19             5      0.000033
 20             7      0.000047
 21             5      0.000033
 22             7      0.000047
 23             5      0.000033
 24             6      0.000040
 25             7      0.000047
 26             4      0.000027
 27             9      0.000060
 28             6      0.000040
 29             6      0.000040
 ...          ...           ...
 149970         6      0.000040
 149971         5      0.000033
 149972         7      0.000047
 149973         8      0.000053
 149974         8      0.000053
 149975         7      0.000047
 149976         3      0.000020
 149977         6      0.000040
 149978         2      0.000013
 149979         3      0.000020
 149980         7      0.000047
 149981         6      0.000040
 149982         7      0.000047
 149983         4      0.000027
 149984         5      0.000033
 149985         4      0.000027
 149986         4      0.000027
 149987         5      0.000033
 149988         6      0.000040
 149989         6      0.000040
 149990         7      0.000047
 149991         7      0.000047
 149992         7      0.000047
 149993         7      0.000047
 149994         4      0.000027
 149995         4      0.000027
 149996         7      0.000047
 149997        11      0.000073
 149998         6      0.000040
 149999         6      0.000040
 
 [150000 rows x 2 columns],
 'nb_total_na': 883924,
 'total_pct_na': 0.5357115151515152}

Prediction using skicit learn

Structure of the data


In [226]:
ec.structure()


Out[226]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key
SeriousDlqin2yrs float64 numeric 50000 0.333333 2 False False False
RevolvingUtilizationOfUnsecuredLines object factor 50000 0.333333 10 False False False
age float64 numeric 50000 0.333333 86 False False False
NumberOfTime30-59DaysPastDueNotWorse float64 numeric 50000 0.333333 16 False False False
DebtRatio object factor 50000 0.333333 10 False False False
MonthlyIncome object character 50000 0.333333 11 False False False
NumberOfOpenCreditLinesAndLoans float64 numeric 50000 0.333333 58 False False False
NumberOfTimes90DaysLate float64 numeric 50000 0.333333 19 False False False
NumberRealEstateLoansOrLines float64 numeric 50000 0.333333 28 False False False
NumberOfTime60-89DaysPastDueNotWorse float64 numeric 50000 0.333333 13 False False False
NumberOfDependents float64 numeric 53924 0.359493 13 False False False

Cleaning


In [231]:
# Dirty cleaning 
df_simu.loc[df_simu.NumberOfOpenCreditLinesAndLoans >=10,'NumberOfOpenCreditLinesAndLoans'] =10
df_simu.loc[df_simu.NumberRealEstateLoansOrLines >=5,'NumberRealEstateLoansOrLines'] = 5
df_simu.loc[df_simu.NumberOfTimes90DaysLate >=5,'NumberOfTimes90DaysLate'] = 5
df_simu.loc[df_simu.NumberOfTimes90DaysLate >=5,'NumberOfTimes90DaysLate'] = 5

Transformation to discrete variables


In [243]:
df_simu.age = pd.qcut(df_simu.age,10)

In [244]:
df_simu.age.value_counts()


Out[244]:
(32, 39]     11365
(56, 61]     11036
(43, 47]     10684
(61, 66]     10607
[0, 32]      10218
(39, 43]     10133
(47, 51]      9108
(72, 109]     9031
(66, 72]      9009
(51, 56]      8809
dtype: int64

In [245]:
DataExploration(df_simu).structure()


Out[245]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key
SeriousDlqin2yrs float64 numeric 50000 0.333333 2 False False False
RevolvingUtilizationOfUnsecuredLines object factor 50000 0.333333 10 False False False
age category factor 50000 0.333333 10 False False False
NumberOfTime30-59DaysPastDueNotWorse float64 numeric 50000 0.333333 16 False False False
DebtRatio object factor 50000 0.333333 10 False False False
MonthlyIncome object character 50000 0.333333 11 False False False
NumberOfOpenCreditLinesAndLoans float64 numeric 50000 0.333333 11 False False False
NumberOfTimes90DaysLate float64 numeric 50000 0.333333 6 False False False
NumberRealEstateLoansOrLines float64 numeric 50000 0.333333 6 False False False
NumberOfTime60-89DaysPastDueNotWorse float64 numeric 50000 0.333333 13 False False False
NumberOfDependents float64 numeric 53924 0.359493 13 False False False

Slkicit learn age imputation


In [271]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
test = pd.get_dummies(df_simu[df_simu.age.isnull()].drop('age',axis =1).fillna('mean')).as_matrix()
X = pd.get_dummies(df_simu[df_simu.age.notnull()].drop('age',axis =1).fillna('mean')).as_matrix()
y = df_simu.age[df_simu.age.notnull()].values
clf.fit(X, y)


Out[271]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [275]:
X.shape


Out[275]:
(100000, 106)

In [277]:
# train prediction
X.shape
clf.predict(X)


Out[277]:
array(['(43, 47]', '[0, 32]', '[0, 32]', ..., '(61, 66]', '(72, 109]',
       '(43, 47]'], 
      dtype='|S9')

In [278]:
clf.predict_proba(X)


Out[278]:
array([[  1.83289771e-01,   2.19381391e-01,   2.90085412e-01, ...,
          6.84279563e-04,   6.03975113e-05,   1.66190304e-02],
       [  1.40230809e-01,   5.42521369e-02,   3.86441449e-02, ...,
          3.34233046e-02,   3.84214041e-02,   5.91624745e-01],
       [  9.36480784e-02,   3.93464423e-02,   3.11406469e-02, ...,
          6.04516585e-03,   5.99939570e-03,   7.68079512e-01],
       ..., 
       [  2.64899184e-02,   3.15004954e-02,   4.77347598e-02, ...,
          1.90957938e-01,   8.03141975e-02,   2.86331036e-03],
       [  1.35909450e-01,   9.31697184e-02,   7.90661232e-02, ...,
          1.14578204e-01,   1.71856434e-01,   1.31061988e-01],
       [  1.35726641e-01,   1.68230766e-01,   2.02803346e-01, ...,
          3.70781645e-02,   7.03995712e-03,   1.25880481e-02]])

In [279]:
clf.score(X, y, sample_weight=None)


Out[279]:
0.2114

In [282]:
res= np.array([clf.predict(X),y]).T

In [283]:
res


Out[283]:
array([['(43, 47]', '(43, 47]'],
       ['[0, 32]', '(39, 43]'],
       ['[0, 32]', '(32, 39]'],
       ..., 
       ['(61, 66]', '(56, 61]'],
       ['(72, 109]', '[0, 32]'],
       ['(43, 47]', '(61, 66]']], dtype=object)

In [273]:
# test prediction
test.shape
#clf.predict(test)


Out[273]:
(50000, 103)

In [251]:
y


Out[251]:
[(43, 47], (39, 43], (32, 39], [0, 32], (56, 61], ..., [0, 32], (43, 47], (56, 61], [0, 32], (61, 66]]
Length: 100000
Categories (10, object): [[0, 32] < (32, 39] < (39, 43] < (43, 47] ... (56, 61] < (61, 66] < (66, 72] < (72, 109]]

In [ ]:
df_simu.drop('age',axis =1)

In [ ]:
df_simu.age.isnull()

In [182]:
df_simu.age


Out[182]:
0               nan
1          (39, 50]
2               nan
3           [0, 39]
4               nan
5               nan
6               nan
7               nan
8               nan
9          (50, 57]
10          [0, 39]
11         (50, 57]
12         (39, 50]
13         (39, 50]
14              nan
15              nan
16        (65, 109]
17              nan
18              nan
19          [0, 39]
20         (39, 50]
21              nan
22              nan
23              nan
24         (57, 65]
25         (39, 50]
26         (57, 65]
27        (65, 109]
28          [0, 39]
29         (57, 65]
            ...    
149970     (57, 65]
149971          nan
149972     (39, 50]
149973     (39, 50]
149974     (57, 65]
149975     (57, 65]
149976          nan
149977      [0, 39]
149978     (50, 57]
149979          nan
149980     (57, 65]
149981     (39, 50]
149982      [0, 39]
149983          nan
149984    (65, 109]
149985      [0, 39]
149986          nan
149987      [0, 39]
149988      [0, 39]
149989     (57, 65]
149990          nan
149991          nan
149992     (39, 50]
149993      [0, 39]
149994     (39, 50]
149995    (65, 109]
149996     (39, 50]
149997     (57, 65]
149998          nan
149999     (57, 65]
Name: age, dtype: object